library(haven)
library(mgcv)
library(gridExtra)
library(plyr)
library(tidyverse)
library(hardhat)
library(tidygam)
library(gKRLS)

# read in data frames from 2024

data_c9 <- read_csv("hps_04_02_09_puf.csv")
data_c8 <- read_csv("hps_04_02_08_puf.csv")
data_c7 <- read_csv("hps_04_01_07_puf.csv")
data_c6 <- read_csv("hps_04_01_06_puf.csv")
data_c5 <- read_csv("hps_04_01_05_puf.csv")
data_c4 <- read_csv("hps_04_01_04_puf.csv")
data_c3 <- read_csv("hps_04_00_03_puf.csv")
data_c2 <- read_csv("hps_04_00_02_puf.csv")
data_c1 <- read_csv("hps_04_00_01_puf.csv")

# append data frames from 2024

data_2024 <- rbind.fill(data_c1, data_c2, data_c3,
                        data_c4, data_c5, data_c6,
                        data_c7, data_c8, data_c9)

# delete separate dataframes
rm(data_c1, data_c2, data_c3,
           data_c4, data_c5, data_c6,
           data_c7, data_c8, data_c9)

# read data frames from 2023
data_63 <- read_csv("pulse2023_puf_63.csv")
data_62 <- read_csv("pulse2023_puf_62.csv")
data_61 <- read_csv("pulse2023_puf_61.csv")
data_60 <- read_csv("pulse2023_puf_60.csv")
data_59 <- read_csv("pulse2023_puf_59.csv")
data_58 <- read_csv("pulse2023_puf_58.csv")
data_57 <- read_csv("pulse2023_puf_57.csv")
data_56 <- read_csv("pulse2023_puf_56.csv")
data_55 <- read_csv("pulse2023_puf_55.csv")
data_54 <- read_csv("pulse2023_puf_54.csv")
data_53 <- read_csv("pulse2023_puf_53.csv")

# append data frames from 2023
data_2023 <- rbind.fill(data_63, data_62, data_61,
                   data_60, data_59, data_58,
                   data_57, data_56, data_55,
                   data_54, data_53)

# delete separate data frames from 2023
rm(data_63, data_62, data_61, data_60, data_59,
   data_58, data_57, data_56, data_55, data_54,
   data_53)

# read in data frames from 2022
data_52 <- read_csv("pulse2022_puf_52.csv")
data_51 <- read_csv("pulse2022_puf_51.csv")
data_50 <- read_csv("pulse2022_puf_50.csv")
data_49 <- read_csv("pulse2022_puf_49.csv")
data_48 <- read_csv("pulse2022_puf_48.csv")
data_47 <- read_csv("pulse2022_puf_47.csv")
data_46 <- read_csv("pulse2022_puf_46.csv")
data_45 <- read_csv("pulse2022_puf_45.csv")
data_44 <- read_csv("pulse2022_puf_44.csv")
data_43 <- read_csv("pulse2022_puf_43.csv")
data_42 <- read_csv("pulse2022_puf_42.csv")
data_41 <- read_csv("pulse2022_puf_41.csv")

# append dataframes from 2022
data_2022 <- rbind.fill(data_52, data_51, data_50, data_49,
                        data_48, data_47, data_46, data_45,
                        data_44, data_43, data_42, data_41)

# delete individual dataframes
rm(data_52, data_51, data_50, data_49,
   data_48, data_47, data_46, data_45,
   data_44, data_43, data_42, data_41)

# read in dataframes from 2021
data_40 <- read_csv("pulse2021_puf_40.csv")
data_39 <- read_csv("pulse2021_puf_39.csv")
data_38 <- read_csv("pulse2021_puf_38.csv")
data_37 <- read_csv("pulse2021_puf_37.csv")
data_36 <- read_csv("pulse2021_puf_36.csv")
data_35 <- read_csv("pulse2021_puf_35.csv")
data_34 <- read_csv("pulse2021_puf_34.csv")

# append dataframes from 2021
data_2021 <- rbind.fill(data_40, data_39, data_38, data_37,
                        data_36, data_35, data_34)

# delete separate dataframes
rm(data_40, data_39, data_38, data_37,
   data_36, data_35, data_34)

# append all dataframes
data_combined <- rbind.fill(data_2024, data_2023, data_2022, data_2021)

# delete unneeded dataframes
rm(data_2024, data_2023, data_2022, data_2021)

# use this value to normalize weights
k <- mean(data_combined$PWEIGHT)

# harmonize different variable names
clean_data <- data_combined %>%
  mutate(genid_q = if_else(is.na(GENID_DESCRIBE), RGENID_DESCRIBE, GENID_DESCRIBE)) %>%
  filter(genid_q != -88) # remove cases that were not asked gender identity

# delete redundant dataframe
rm(data_combined)

clean_data <- clean_data %>%# remove cases not asked about disability
  filter(SELFCARE != -88, MOBILITY != -88, SEEING != -88, HEARING != -88)

clean_data <- clean_data %>% 
  filter(genid_q != -99) # remove cases that saw and skipped gender identity

clean_data <- clean_data %>% # remove cases that saw and skipped disability
  filter(SELFCARE != -99, MOBILITY != -99, SEEING != -99, HEARING != -99) %>%
  filter(TBIRTH_YEAR > 1935) # remove bottom-coded values for birth year


clean_data <- clean_data %>%
  mutate(norm_weights = PWEIGHT/k, # normalize sampling weights
         age = case_when(!is.na(CYCLE) ~ 2024 - TBIRTH_YEAR, # get age from birth year
                         WEEK %in% c(53, 63) ~ 2023 - TBIRTH_YEAR,
                         WEEK %in% c(41, 52) ~ 2022 - TBIRTH_YEAR,
                         WEEK %in% c(34, 40) ~ 2021 - TBIRTH_YEAR),
         ethnicity = if_else(RHISPANIC == 2, "Hispanic", "Non-Hispanic"), # clean race, ethnicity, and education vars
         race = case_when(RRACE == 1 ~ "White",
                                    RRACE == 2 ~ "Black",
                                    RRACE == 3 ~ "Asian",
                                    RRACE == 4 ~ "Other/Multiracial"),
         edu = case_when(EEDUC %in% c(1,2) ~ "LTHS",
                         EEDUC == 3 ~ "HS grad",
                         EEDUC < 6 ~ "Some college",
                         EEDUC > 5 ~ "College grad"),
         afab = if_else(EGENID_BIRTH == 1, 0, 1), # indicator for assigned female at birth
         blind = case_when(SEEING %in% c(1,2) ~ 0, # indicators for each impairment
                           SEEING %in% c(3,4) ~ 1),
         deaf = case_when(HEARING %in% c(1,2) ~ 0,
                           HEARING %in% c(3,4) ~ 1),
         walk = case_when(MOBILITY %in% c(1,2) ~ 0,
                           MOBILITY %in% c(3,4) ~ 1),
         dress = case_when(SELFCARE %in% c(1,2) ~ 0,
                           SELFCARE %in% c(3,4) ~ 1))

clean_data <- clean_data %>%
  mutate(gender_id = case_when(afab == 1 & genid_q == 2 ~ "Cis woman", # detailed gender identity variable
                               afab == 0 & genid_q == 1 ~ "Cis man",
                               genid_q == 4 ~ "Gender Nonconforming",
                               afab == 1 ~ "Transmasc",
                               afab == 0 ~ "Transfem"),
         trans = case_when(afab == 1 & genid_q == 2 ~ 0, # binary trans/non-trans variable
                           afab == 0 & genid_q == 1 ~ 0,
                           TRUE ~ 1),
         dis_phys = case_when(blind + deaf + walk + dress > 0 ~ 1, # binary physical disability indicator
                              blind + deaf + walk + dress == 0 ~ 0))
  
clean_data <- clean_data %>% # change binary variables to factors
  mutate(dis_phys_fac = factor(dis_phys, levels = c(0,1), labels = c('No', 'Yes')),
         trans_fac = factor(trans, levels = c(0,1), labels = c('Non-trans', 'Trans')))

clean_data <- clean_data %>% # bin birth year into cohorts
  mutate(birth_year_binned = case_when(TBIRTH_YEAR < 1946 ~ "1945",
                                       TBIRTH_YEAR < 1956 ~ "1955",
                                       TBIRTH_YEAR < 1966 ~ "1965",
                                       TBIRTH_YEAR < 1976 ~ "1975",
                                       TBIRTH_YEAR < 1986 ~ "1985",
                                       TBIRTH_YEAR < 1996 ~ "1995",
                                       TBIRTH_YEAR < 2006 ~ "2005"),
         time_period_fac = factor(if_else(is.na(WEEK), CYCLE, WEEK)))

#Descriptive statistics (table A2)

library(modelsummary)
wtpct <- function(x, y) sum(x, na.rm = TRUE) / sum(y, na.rm = T) * 100

datasummary((`Birth cohort` = birth_year_binned) + (`Race` = race) + (`Ethnicity` = ethnicity) + (`Education` = edu) + (`Physical Disability` = dis_phys_fac) + 1 ~ (`Gender Identity` = gender_id)*(N + norm_weights*Percent(fn = wtpct, denom = "col")),
            data = clean_data,
            output = "hps_summary.html")

# fit model for figure 1
dis_phys_fit <- gam(trans ~ time_period_fac + s(TBIRTH_YEAR, by = dis_phys_fac, bs = "gKRLS"),
                   family = binomial(link = "logit"),
                   data = clean_data,
                   method = "REML",
                   weights = norm_weights)

# get variance covariance matrix
robust <- vcovCL(dis_phys_fit, cluster = clean_data$EST_ST)

# years to get predictions for
year_5_dbl <- c(1935, 1935, 1940, 1940, 1945, 1945, 1950, 1950, 1955, 1955, 1960, 1960,
                1965, 1965, 1970, 1970, 1975, 1975, 1980, 1980, 1985, 1985, 1990, 1990,
                1995, 1995, 2000, 2000, 2002, 2002, 2004, 2004, 2005, 2005)

# get predictions
dis_effects <- calculate_effects(dis_phys_fit,
                                 variables = c("TBIRTH_YEAR"),
                                 conditional = data.frame("dis_phys_fac" = c("Yes", "No"),
                                                          "TBIRTH_YEAR" = year_5_dbl,
                                                          "time_period_fac" = "9"),
                                 vcov = robust,
                                 continuous_type = "predict",
                                 verbose = T)

write_csv(dis_effects, "dis_effects_hps.csv")


## figure 4 (AFAB)

# run model
dis_phys_fit_afab <- gam(trans ~ time_period_fac + s(TBIRTH_YEAR, by = dis_phys_fac, bs = "gKRLS"),
                    family = binomial(link = "logit"),
                    data = clean_data[which(clean_data$afab == 1),],
                    method = "REML",
                    weights = norm_weights)

# get variance covariance matrix
robust_afab <- vcovCL(dis_phys_fit_afab, cluster = clean_data[which(clean_data$afab == 1),]$EST_ST)

# get predictions
dis_effects_afab <- calculate_effects(dis_phys_fit_afab,
                                 variables = c("TBIRTH_YEAR"),
                                 conditional = data.frame("dis_phys_fac" = c("Yes", "No"),
                                                          "TBIRTH_YEAR" = year_5_dbl,
                                                          "time_period_fac" = "9"),
                                 vcov = robust_afab,
                                 continuous_type = "predict",
                                 verbose = T)

write_csv(dis_effects_afab, "dis_effects_afab_hps.csv")


## Figure 4 (AMAB)

# run model
dis_phys_fit_amab <- gam(trans ~ time_period_fac + s(TBIRTH_YEAR, by = dis_phys_fac, bs = "gKRLS"),
                         family = binomial(link = "logit"),
                         data = clean_data[which(clean_data$afab == 0),],
                         method = "REML",
                         weights = norm_weights)

# get variance covariance matrix
robust_amab <- vcovCL(dis_phys_fit_amab, cluster = clean_data[which(clean_data$afab == 0),]$EST_ST)

# get predictions
dis_effects_amab <- calculate_effects(dis_phys_fit_amab,
                                      variables = c("TBIRTH_YEAR"),
                                      conditional = data.frame("dis_phys_fac" = c("Yes", "No"),
                                                               "TBIRTH_YEAR" = year_5_dbl,
                                                               "time_period_fac" = "9"),
                                      vcov = robust_amab,
                                      continuous_type = "predict",
                                      verbose = T)

write_csv(dis_effects_amab, "dis_effects_amab_hps.csv")

## Appendix B

# create more exclusive measure of trans identity
clean_data <- clean_data %>%
  mutate(trans_exc = case_when(trans == 0 ~ 0,
                               genid_q == 4 ~ 0,
                               trans == 1 ~ 1),
         trans_exc_fac = factor(trans_exc, levels = c(0,1), labels = c("Non-trans", "Trans")))

# fit model for figure B1
dis_phys_fit_b1 <- gam(trans_exc ~ time_period_fac + s(TBIRTH_YEAR, by = dis_phys_fac, bs = "gKRLS"),
                    family = binomial(link = "logit"),
                    data = clean_data,
                    method = "REML",
                    weights = norm_weights)

# get variance covariance matrix
robust_b1 <- vcovCL(dis_phys_fit_b1, cluster = clean_data$EST_ST)

# get predictions
dis_effects_b1 <- calculate_effects(dis_phys_fit_b1,
                                 variables = c("TBIRTH_YEAR"),
                                 conditional = data.frame("dis_phys_fac" = c("Yes", "No"),
                                                          "TBIRTH_YEAR" = year_5_dbl,
                                                          "time_period_fac" = "9"),
                                 vcov = robust,
                                 continuous_type = "predict",
                                 verbose = T)

write_csv(dis_effects_b1, "dis_effects_hps_app_b.csv")


## figure b2 (afab)

# run model
dis_phys_fit_afab_b2 <- gam(trans_exc ~ time_period_fac + s(TBIRTH_YEAR, by = dis_phys_fac, bs = "gKRLS"),
                         family = binomial(link = "logit"),
                         data = clean_data[which(clean_data$afab == 1),],
                         method = "REML",
                         weights = norm_weights)

# get variance covariance matrix
robust_afab_b2 <- vcovCL(dis_phys_fit_afab_b2, cluster = clean_data[which(clean_data$afab == 1),]$EST_ST)

# get predictions
dis_effects_afab_b2 <- calculate_effects(dis_phys_fit_afab_b2,
                                      variables = c("TBIRTH_YEAR"),
                                      conditional = data.frame("dis_phys_fac" = c("Yes", "No"),
                                                               "TBIRTH_YEAR" = year_5_dbl,
                                                               "time_period_fac" = "9"),
                                      vcov = robust_afab,
                                      continuous_type = "predict",
                                      verbose = T)

write_csv(dis_effects_afab, "dis_effects_afab_hps_app_b.csv")


## Figure b2 (amab)

# run model
dis_phys_fit_amab_b2 <- gam(trans_exc ~ time_period_fac + s(TBIRTH_YEAR, by = dis_phys_fac, bs = "gKRLS"),
                         family = binomial(link = "logit"),
                         data = clean_data[which(clean_data$afab == 0),],
                         method = "REML",
                         weights = norm_weights)

# get variance covariance matrix
robust_amab_b2 <- vcovCL(dis_phys_fit_amab_b2, cluster = clean_data[which(clean_data$afab == 0),]$EST_ST)

# get predictions
dis_effects_amab_b2 <- calculate_effects(dis_phys_fit_amab_b2,
                                      variables = c("TBIRTH_YEAR"),
                                      conditional = data.frame("dis_phys_fac" = c("Yes", "No"),
                                                               "TBIRTH_YEAR" = year_5_dbl,
                                                               "time_period_fac" = "9"),
                                      vcov = robust_amab,
                                      continuous_type = "predict",
                                      verbose = T)

write_csv(dis_effects_amab_b2, "dis_effects_amab_hps_app_b.csv")


## Appendix D

 clean_data <- clean_data %>% # create ordinal severity variable
  mutate(dis_severe_num = case_when(SEEING == 4 | HEARING == 4 | MOBILITY == 4 | SELFCARE == 4 ~ 4,
                                    SEEING == 3 | HEARING == 3 | MOBILITY == 3 | SELFCARE == 3 ~ 3,
                                    SEEING == 2 | HEARING == 2 | MOBILITY == 2 | SELFCARE == 2 ~ 2,
                                    dis_phys == 0 ~ 1),
         dis_severe_fac = factor(dis_severe_num,
                                 levels = c(1:4),
                                 labels = c("No difficulty", "Some difficulty",
                                            "A lot of difficulty", "Cannot do at all"))) %>%
  select(dis_severe_fac, trans, time_period_fac, TBIRTH_YEAR, norm_weights, EST_ST)

# fit model
severity_fit <- gam(trans ~ time_period_fac + s(TBIRTH_YEAR, by = dis_severe_fac, bs = "gKRLS"),
                    family = binomial(link = "logit"),
                    data = clean_data,
                    method = "REML",
                    weights = norm_weights)

# get var-covar matrix
robust_sev <- vcovCL(severity_fit, cluster = clean_data$EST_ST)

# years to predict
year_5_four <- c(1935, 1935, 1935, 1935, 1940, 1940, 1940, 1940, 1945, 1945, 1945, 1945,
                 1950, 1950, 1950, 1950, 1955, 1955, 1955, 1955, 1960, 1960, 1960, 1960,
                 1965, 1965, 1965, 1965, 1970, 1970, 1970, 1970, 1975, 1975, 1975, 1975,
                 1980, 1980, 1980, 1980, 1985, 1985, 1985, 1985, 1990, 1990, 1990, 1990,
                 1995, 1995, 1995, 1995, 2000, 2000, 2000, 2000, 2002, 2002, 2002, 2002,
                 2004, 2004, 2004, 2004, 2005, 2005, 2005, 2005)

# get predictions
dis_effects <- calculate_effects(severity_fit,
                                 variables = c("TBIRTH_YEAR"),
                                 conditional = data.frame("dis_severe_fac" = c("No difficulty", "Some difficulty",
                                                                               "A lot of difficulty", "Cannot do at all"),
                                                          "TBIRTH_YEAR" = year_5_four,
                                                          "time_period_fac" = "9"),
                                 vcov = robust_sev,
                                 continuous_type = "predict",
                                 verbose = T)

write_csv(dis_effects, "dis_effects_severe.csv")

